packages for data cleaning and visualisation
# Install only the packages that are not already available, instead of
# re-installing on every run. The original calls misspelled "dplyr" as "dyplr"
# and requested "ggplot", which is not the package attached below (ggplot2 is).
# dplyr and ggplot2 are listed explicitly because they are attached by name.
required_packages <- c("tidyverse", "dplyr", "ggplot2")
is_installed <- vapply(
  required_packages,
  requireNamespace,
  logical(1),
  quietly = TRUE
)
if (!all(is_installed)) {
  install.packages(required_packages[!is_installed])
}
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ forcats 1.0.0 ✔ readr 2.1.5
## ✔ ggplot2 3.5.1 ✔ stringr 1.5.1
## ✔ lubridate 1.9.3 ✔ tibble 3.2.1
## ✔ purrr 1.0.2 ✔ tidyr 1.3.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(ggplot2)
Load the data file into a data frame with a short name for easy reference
# Read the passenger CSV (path relative to the working directory) into `data`.
data <- read.csv(file = "titanic_vis_clean.csv")
get a global understanding of data structure and summary statistics
# Preview the first six rows of the data frame.
head(data, n = 6L)
## passengerid survived pclass
## 1 1 0 3
## 2 2 1 1
## 3 3 1 3
## 4 4 1 1
## 5 5 0 3
## 6 6 0 3
## name sex age sibsp parch
## 1 Braund, Mr. Owen Harris male 22 1 0
## 2 Cumings, Mrs. John Bradley (Florence Briggs Thayer) female 38 1 0
## 3 Heikkinen, Miss. Laina female 26 0 0
## 4 Futrelle, Mrs. Jacques Heath (Lily May Peel) female 35 1 0
## 5 Allen, Mr. William Henry male 35 0 0
## 6 Moran, Mr. James male 28 0 0
## ticket fare embarked data.sex as.factor.sex. as.factor.survived.
## 1 A/5 21171 7.2500 S male male 0
## 2 PC 17599 71.2833 C female female 1
## 3 STON/O2. 3101282 7.9250 S female female 1
## 4 113803 53.1000 S female female 1
## 5 373450 8.0500 S male male 0
## 6 330877 8.4583 Q male male 0
## as.factor.embarked.
## 1 S
## 2 C
## 3 S
## 4 S
## 5 S
## 6 Q
# Per-column summary: quartiles and mean for numeric columns, length and class
# for character columns.
summary(data)
## passengerid survived pclass name
## Min. : 1 Min. :0.0000 Min. :1.000 Length:889
## 1st Qu.:224 1st Qu.:0.0000 1st Qu.:2.000 Class :character
## Median :446 Median :0.0000 Median :3.000 Mode :character
## Mean :446 Mean :0.3825 Mean :2.312
## 3rd Qu.:668 3rd Qu.:1.0000 3rd Qu.:3.000
## Max. :891 Max. :1.0000 Max. :3.000
## sex age sibsp parch
## Length:889 Min. : 0.42 Min. :0.0000 Min. :0.0000
## Class :character 1st Qu.:22.00 1st Qu.:0.0000 1st Qu.:0.0000
## Mode :character Median :28.00 Median :0.0000 Median :0.0000
## Mean :29.32 Mean :0.5242 Mean :0.3825
## 3rd Qu.:35.00 3rd Qu.:1.0000 3rd Qu.:0.0000
## Max. :80.00 Max. :8.0000 Max. :6.0000
## ticket fare embarked data.sex
## Length:889 Min. : 0.000 Length:889 Length:889
## Class :character 1st Qu.: 7.896 Class :character Class :character
## Mode :character Median : 14.454 Mode :character Mode :character
## Mean : 32.097
## 3rd Qu.: 31.000
## Max. :512.329
## as.factor.sex. as.factor.survived. as.factor.embarked.
## Length:889 Min. :0.0000 Length:889
## Class :character 1st Qu.:0.0000 Class :character
## Mode :character Median :0.0000 Mode :character
## Mean :0.3825
## 3rd Qu.:1.0000
## Max. :1.0000
# Compact structure listing: dimensions, column types and the first few values.
str(data)
## 'data.frame': 889 obs. of 15 variables:
## $ passengerid : int 1 2 3 4 5 6 7 8 9 10 ...
## $ survived : int 0 1 1 1 0 0 0 0 1 1 ...
## $ pclass : int 3 1 3 1 3 3 1 3 3 2 ...
## $ name : chr "Braund, Mr. Owen Harris" "Cumings, Mrs. John Bradley (Florence Briggs Thayer)" "Heikkinen, Miss. Laina" "Futrelle, Mrs. Jacques Heath (Lily May Peel)" ...
## $ sex : chr "male" "female" "female" "female" ...
## $ age : num 22 38 26 35 35 28 54 2 27 14 ...
## $ sibsp : int 1 1 0 1 0 0 0 3 0 1 ...
## $ parch : int 0 0 0 0 0 0 0 1 2 0 ...
## $ ticket : chr "A/5 21171" "PC 17599" "STON/O2. 3101282" "113803" ...
## $ fare : num 7.25 71.28 7.92 53.1 8.05 ...
## $ embarked : chr "S" "C" "S" "S" ...
## $ data.sex : chr "male" "female" "female" "female" ...
## $ as.factor.sex. : chr "male" "female" "female" "female" ...
## $ as.factor.survived.: int 0 1 1 1 0 0 0 0 1 1 ...
## $ as.factor.embarked.: chr "S" "C" "S" "S" ...
Count of Survived passengers
# Bar chart: number of passengers in each survival group (survived is coded 0/1)
ggplot(data, aes(x = as.factor(survived))) +
  geom_bar() +
  labs(
    x = "survived",
    y = "count",
    title = "count of survived passengers"
  )
Scatter plot of age against fare price. Most fares are clustered below
about 100, with a few outliers above 200 and above 500 among passengers
aged between 20 and 40 years.
# Scatter plot: fare against age, one point per passenger
ggplot(data = data, mapping = aes(x = age, y = fare)) +
  geom_point()
Histogram displaying the age distribution as passenger counts. Most
passengers were ~25-40 years old.
# Histogram of passenger age using bins of width 5
ggplot(data, aes(x = age)) +
  geom_histogram(binwidth = 5, fill = "blue", colour = "black") +
  labs(
    x = "age",
    y = "count",
    title = "age distribution of passengers"
  )
Boxplot showing the age distribution by survival status. Median age is
higher for those who did not survive (left box), indicating that younger
passengers had a higher survival rate.
# Boxplot of age for each survival group (survived is coded 0/1).
# The title no longer mentions "density": a boxplot shows quartiles and
# outliers only, and the previous wording duplicated the violin plot's title.
ggplot(data, aes(x = as.factor(survived), y = age)) +
  geom_boxplot() +
  xlab("survived") +
  ylab("age") +
  ggtitle("age distribution by survival status")
Violin plot displaying the age distribution by survival status. The majority
of passengers were ~25-30 years old, among both those who survived and those who did not.
# Violin plot: shape of the age distribution within each survival group
ggplot(data, aes(x = as.factor(survived), y = age)) +
  geom_violin() +
  labs(
    x = "survived",
    y = "age",
    title = "age distribution and density by survival status"
  )
bar plot of passenger class type and count
# Bar plot of passenger counts per passenger class.
# The column is referenced by bare name inside aes(): using `data$pclass`
# bypasses ggplot2's data masking and breaks if the plot data is later
# filtered, faceted or replaced.
ggplot(data, aes(x = as.factor(pclass))) +
  geom_bar(fill = "green") +
  xlab("passenger class") +
  ylab("count") +
  ggtitle("count of passenger class")
Bar plot of embarkation location and count. Cherbourg (C), Queenstown
(Q), Southampton (S)
# Bar plot of passenger counts per embarkation point.
# The title previously read "count of passenger class", copied from the
# passenger-class plot; it now describes what this plot shows.
ggplot(data, aes(x = embarked)) +
  geom_bar(fill = "lightblue") +
  xlab("embarking point") +
  ylab("count") +
  ggtitle("count of passengers by embarking point")
Scatter plot showing that those who paid the highest fares had a higher chance of survival.
# Scatter plot of fare against age, one panel per survival value (0/1).
# Fixes the "staus" typo in the plot title.
ggplot(data, aes(x = age, y = fare)) +
  geom_point() +
  facet_grid(. ~ survived) +
  xlab("age") +
  ylab("fare") +
  ggtitle("age vs fare by survival status")
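Scatter plot of age vs fare, faceted by passenger class. Note that most passengers held a class 3 ticket (median pclass is 3 in the summary above), not class 1.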
# Scatter plot of fare against age, one panel per passenger class
ggplot(data, aes(x = age, y = fare)) +
  geom_point(colour = "red") +
  facet_grid(cols = vars(pclass)) +
  labs(
    x = "age",
    y = "fare",
    title = "age vs fare by passenger class"
  )
# Single-panel scatter plot of fare against age, points coloured by passenger class
ggplot(data, aes(x = age, y = fare, colour = as.factor(pclass))) +
  geom_point(size = 2) +
  scale_colour_manual(
    values = c("1" = "red", "2" = "orange", "3" = "green")
  ) +
  labs(
    x = "Age",
    y = "Fare",
    title = "Age vs. Fare by Passenger Class",
    colour = "Passenger Class"
  )
installation of ‘plotly’ package for interactive stacked bar plots
# Install dplyr and plotly only when they are missing, then attach each once.
# Previously plotly was installed unconditionally and the requireNamespace()
# guards ran only after the packages had already been installed and attached,
# so they could never trigger; the library() calls were also duplicated.
for (pkg in c("dplyr", "plotly")) {
  if (!requireNamespace(pkg, quietly = TRUE)) {
    install.packages(pkg)
  }
}
library(dplyr)
library(plotly)
Data preparation for the interactive bar plot by creating new features
# Count passengers per (pclass, survived) cell, then express each count as a
# percentage of its own passenger class so that every stacked bar sums to 100.
# `.groups = "drop_last"` keeps the pclass grouping for the mutate() step;
# the original dropped all groups, which made the percentages shares of the
# whole data set rather than of each class.
# The summary lives in its own object so the passenger-level `data` frame is
# not overwritten.
class_survival <- data %>%
  group_by(pclass, survived) %>%
  summarise(count = n(), .groups = "drop_last") %>%
  mutate(percentage = count / sum(count) * 100) %>%
  ungroup()
# Convert to data frame
class_survival <- as.data.frame(class_survival)
# Convert survived to factor so it is used as a discrete colour grouping
class_survival$survived <- as.factor(class_survival$survived)
# Interactive stacked bar plot; hover text shows the outcome and its percentage
plot <- plot_ly(
  class_survival,
  x = ~pclass,
  y = ~percentage,
  type = "bar",
  color = ~survived,
  text = ~paste("Survived:", survived, "<br>Percentage:", round(percentage, 2), "%"),
  hoverinfo = "text",
  textposition = "auto"
) %>%
  layout(
    barmode = "stack",
    xaxis = list(title = "Passenger Class"),
    yaxis = list(title = "Percentage"),
    title = "Survival Proportions by Passenger Class",
    legend = list(title = list(text = "Survived"))
  )
plot
scatter plot of age vs fare colored by embarkation with linear regression (lm).
# Reload the passenger-level data from the CSV before plotting
data <- read.csv(file = "titanic_vis_clean.csv")
# Fare against age, points coloured by embarkation code, with a linear-model
# smoother drawn in blue
ggplot(data, aes(x = age, y = fare, colour = embarked)) +
  geom_point() +
  geom_smooth(method = "lm", colour = "blue") +
  labs(
    x = "age",
    y = "fare",
    title = "age and fare by embarkation"
  )
## `geom_smooth()` using formula = 'y ~ x'